This notebook will compare the results of the cross-validation simulation run with real data from Understanding Society, so that we can evaluate the transition models.
After generation of the final datasets (final_US), the respondents were separated into 2 roughly equal halves based on their pidp. One half was labelled the transition dataset and the other the simulation dataset. Transition models were then fit using the transition dataset, and the other half was then simulated with these models.
The results of the simulation will now be compared to the simulation dataset to see how closely our models mimic reality.
First things first, we need to make sure that we are comparing the exact same people from each dataset. Therefore, we need a step here to check the pidps from both, and remove anybody not in both datasets.
# Collect the distinct pidp values present in each dataset.
cv.pidps <- unique(select(cv, pidp))
raw.pidps <- unique(select(raw, pidp))

# Keep only the pidps found in both, then drop every row of either dataset
# whose pidp is not in that shared set, so later comparisons cover the same
# people.
both.pidps <- intersect(cv.pidps, raw.pidps)
cv <- filter(cv, pidp %in% both.pidps$pidp)
raw <- filter(raw, pidp %in% both.pidps$pidp)
# Household income: combine the simulated (cv) and observed (raw) data for
# hh_income via the project helper; the result feeds the plots below.
income.pivoted <- combine_and_pivot_long(
  df1 = cv,
  df1.name = "simulated",
  df2 = raw,
  df2.name = "raw",
  var = "hh_income"
)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(df1.name)
##
## # Now:
## data %>% select(all_of(df1.name))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(df2.name)
##
## # Now:
## data %>% select(all_of(df2.name))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
# Compare mean hh_income between scenarios and save the figure.
# NOTE(review): save.path is an absolute path on one user's machine; the
# warnings captured in this notebook show dir.create() failing for it in the
# render environment -- confirm the intended output location.
plot_mean_comparison(
  pivoted.df = income.pivoted,
  var = "hh_income",
  group.var = "scenario",
  save = TRUE,
  save.path = "/home/luke/Documents/WORK/MINOS/VALIDATION_PLOTS/Cross-Validation/"
)
## `summarise()` has grouped output by 'time'. You can override using the
## `.groups` argument.
## Warning in dir.create(dir, recursive = TRUE): cannot create dir '/home/luke',
## reason 'Operation not supported'
## Warning in normalizePath(dir):
## path[1]="/home/luke/Documents/WORK/MINOS/VALIDATION_PLOTS/Cross-Validation": No
## such file or directory
## Warning in grDevices::dev.off(): agg could not write to the given file
# Box plot of hh_income for the two scenarios, no range restriction.
# NOTE(review): the pivot step names the simulated data "simulated", whereas
# scen2 here is "simulation" -- confirm compare_boxplot_long() does not need
# these labels to match exactly.
compare_boxplot_long(
  pivoted.df = income.pivoted,
  var = "hh_income",
  scen1 = "raw",
  scen2 = "simulation"
)

# Same plot with subset.min / subset.max supplied (presumably restricting the
# plotted values to the -5000..5000 band -- verify against the helper).
compare_boxplot_long(
  pivoted.df = income.pivoted,
  var = "hh_income",
  scen1 = "raw",
  scen2 = "simulation",
  subset.min = -5000,
  subset.max = 5000
)

# Two-sample t-tests of hh_income between cv and raw (the helper prints one
# test per year); the returned object is kept for display afterwards.
income.t.tests <- two_sample_t_test(cv, "cv", raw, "raw", "hh_income")
## [1] "T tests for year 2014 :"
##
## Two Sample t-test
##
## data: tmp.df1[[var]] and tmp.df2[[var]]
## t = 0, df = 27492, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -30.10581 30.10581
## sample estimates:
## mean of x mean of y
## 1772.365 1772.365
##
## [1] "T tests for year 2015 :"
##
## Two Sample t-test
##
## data: tmp.df1[[var]] and tmp.df2[[var]]
## t = -9.5174, df = 24110, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -182.2242 -119.9858
## sample estimates:
## mean of x mean of y
## 1702.215 1853.320
##
## [1] "T tests for year 2016 :"
##
## Two Sample t-test
##
## data: tmp.df1[[var]] and tmp.df2[[var]]
## t = -4.6155, df = 24190, p-value = 3.942e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -100.24851 -40.48344
## sample estimates:
## mean of x mean of y
## 1734.291 1804.657
##
## [1] "T tests for year 2017 :"
##
## Two Sample t-test
##
## data: tmp.df1[[var]] and tmp.df2[[var]]
## t = -8.4422, df = 22268, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -163.9345 -102.1551
## sample estimates:
## mean of x mean of y
## 1700.317 1833.362
##
## [1] "T tests for year 2018 :"
##
## Two Sample t-test
##
## data: tmp.df1[[var]] and tmp.df2[[var]]
## t = -8.4251, df = 22516, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -168.2276 -104.7257
## sample estimates:
## mean of x mean of y
## 1718.076 1854.553
##
## [1] "T tests for year 2019 :"
##
## Two Sample t-test
##
## data: tmp.df1[[var]] and tmp.df2[[var]]
## t = -9.2848, df = 21272, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -191.1817 -124.5323
## sample estimates:
## mean of x mean of y
## 1744.799 1902.656
##
## [1] "T tests for year 2020 :"
##
## Two Sample t-test
##
## data: tmp.df1[[var]] and tmp.df2[[var]]
## t = -7.0212, df = 18986, p-value = 2.274e-12
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -166.91567 -94.05975
## sample estimates:
## mean of x mean of y
## 1742.243 1872.730
# Show the stored hh_income t-test results.
income.t.tests

# Box plots of hh_income produced by yearly_box_plots(), with the same
# subset.min / subset.max limits as the restricted box plot above.
yearly_box_plots(
  pivoted.df = income.pivoted,
  var = "hh_income",
  scen1 = "raw",
  scen2 = "simulation",
  subset.min = -5000,
  subset.max = 5000
)
# SF_12: combine the simulated (cv) and observed (raw) data for this variable
# via the project helper; the result feeds the SF_12 plots.
sf12.pivoted <- combine_and_pivot_long(
  df1 = cv,
  df1.name = "simulated",
  df2 = raw,
  df2.name = "raw",
  var = "SF_12"
)
# Compare mean SF_12 between scenarios and save the figure to save.path.
plot_mean_comparison(
  pivoted.df = sf12.pivoted,
  var = "SF_12",
  group.var = "scenario",
  save = TRUE,
  save.path = "/home/luke/Documents/WORK/MINOS/VALIDATION_PLOTS/Cross-Validation/"
)
## `summarise()` has grouped output by 'time'. You can override using the
## `.groups` argument.
## Warning in dir.create(dir, recursive = TRUE): cannot create dir '/home/luke',
## reason 'Operation not supported'
## Warning in normalizePath(dir):
## path[1]="/home/luke/Documents/WORK/MINOS/VALIDATION_PLOTS/Cross-Validation": No
## such file or directory
## Warning in grDevices::dev.off(): agg could not write to the given file
# Box plot of SF_12 for the raw and simulated scenarios.
compare_boxplot_long(
  pivoted.df = sf12.pivoted,
  var = "SF_12",
  scen1 = "raw",
  scen2 = "simulation"
)

# Two-sample t-tests of SF_12 between cv and raw.
# Fix: this call previously passed 'hh_income', so sf12.t.tests simply
# repeated the household-income tests (the captured output was identical to
# the income section). It now tests SF_12 as the variable name intends; the
# rendered output must be regenerated.
sf12.t.tests <- two_sample_t_test(cv, "cv", raw, "raw", "SF_12")
## [1] "T tests for year 2014 :"
##
## Two Sample t-test
##
## data: tmp.df1[[var]] and tmp.df2[[var]]
## t = 0, df = 27492, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -30.10581 30.10581
## sample estimates:
## mean of x mean of y
## 1772.365 1772.365
##
## [1] "T tests for year 2015 :"
##
## Two Sample t-test
##
## data: tmp.df1[[var]] and tmp.df2[[var]]
## t = -9.5174, df = 24110, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -182.2242 -119.9858
## sample estimates:
## mean of x mean of y
## 1702.215 1853.320
##
## [1] "T tests for year 2016 :"
##
## Two Sample t-test
##
## data: tmp.df1[[var]] and tmp.df2[[var]]
## t = -4.6155, df = 24190, p-value = 3.942e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -100.24851 -40.48344
## sample estimates:
## mean of x mean of y
## 1734.291 1804.657
##
## [1] "T tests for year 2017 :"
##
## Two Sample t-test
##
## data: tmp.df1[[var]] and tmp.df2[[var]]
## t = -8.4422, df = 22268, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -163.9345 -102.1551
## sample estimates:
## mean of x mean of y
## 1700.317 1833.362
##
## [1] "T tests for year 2018 :"
##
## Two Sample t-test
##
## data: tmp.df1[[var]] and tmp.df2[[var]]
## t = -8.4251, df = 22516, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -168.2276 -104.7257
## sample estimates:
## mean of x mean of y
## 1718.076 1854.553
##
## [1] "T tests for year 2019 :"
##
## Two Sample t-test
##
## data: tmp.df1[[var]] and tmp.df2[[var]]
## t = -9.2848, df = 21272, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -191.1817 -124.5323
## sample estimates:
## mean of x mean of y
## 1744.799 1902.656
##
## [1] "T tests for year 2020 :"
##
## Two Sample t-test
##
## data: tmp.df1[[var]] and tmp.df2[[var]]
## t = -7.0212, df = 18986, p-value = 2.274e-12
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -166.91567 -94.05975
## sample estimates:
## mean of x mean of y
## 1742.243 1872.730
# Show the t-test results stored in sf12.t.tests.
sf12.t.tests

# Box plots of SF_12 produced by yearly_box_plots() (no subset limits).
yearly_box_plots(
  pivoted.df = sf12.pivoted,
  var = "SF_12",
  scen1 = "raw",
  scen2 = "simulation"
)
# Nutrition quality: combine the simulated (cv) and observed (raw) data for
# this variable via the project helper.
nut.pivoted <- combine_and_pivot_long(
  df1 = cv,
  df1.name = "simulated",
  df2 = raw,
  df2.name = "raw",
  var = "nutrition_quality"
)

# Compare mean nutrition_quality between scenarios and save the figure.
plot_mean_comparison(
  pivoted.df = nut.pivoted,
  var = "nutrition_quality",
  group.var = "scenario",
  save = TRUE,
  save.path = "/home/luke/Documents/WORK/MINOS/VALIDATION_PLOTS/Cross-Validation/"
)
## `summarise()` has grouped output by 'time'. You can override using the
## `.groups` argument.
## Warning in dir.create(dir, recursive = TRUE): cannot create dir '/home/luke',
## reason 'Operation not supported'
## Warning in normalizePath(dir):
## path[1]="/home/luke/Documents/WORK/MINOS/VALIDATION_PLOTS/Cross-Validation": No
## such file or directory
## Warning in grDevices::dev.off(): agg could not write to the given file
# Box plot of nutrition_quality for the raw and simulated scenarios.
compare_boxplot_long(
  pivoted.df = nut.pivoted,
  var = "nutrition_quality",
  scen1 = "raw",
  scen2 = "simulation"
)

# Box plots of nutrition_quality produced by yearly_box_plots().
yearly_box_plots(
  pivoted.df = nut.pivoted,
  var = "nutrition_quality",
  scen1 = "raw",
  scen2 = "simulation"
)
# Tobacco (ncigs): combine the simulated (cv) and observed (raw) data for this
# variable via the project helper.
tobacco.pivoted <- combine_and_pivot_long(
  df1 = cv,
  df1.name = "simulated",
  df2 = raw,
  df2.name = "raw",
  var = "ncigs"
)

# Compare mean ncigs between scenarios and save the figure.
plot_mean_comparison(
  pivoted.df = tobacco.pivoted,
  var = "ncigs",
  group.var = "scenario",
  save = TRUE,
  save.path = "/home/luke/Documents/WORK/MINOS/VALIDATION_PLOTS/Cross-Validation/"
)
## `summarise()` has grouped output by 'time'. You can override using the
## `.groups` argument.
## Warning in dir.create(dir, recursive = TRUE): cannot create dir '/home/luke',
## reason 'Operation not supported'
## Warning in normalizePath(dir):
## path[1]="/home/luke/Documents/WORK/MINOS/VALIDATION_PLOTS/Cross-Validation": No
## such file or directory
## Warning in grDevices::dev.off(): agg could not write to the given file
# Box plot of ncigs for the two scenarios, no range restriction.
compare_boxplot_long(
  pivoted.df = tobacco.pivoted,
  var = "ncigs",
  scen1 = "raw",
  scen2 = "simulation"
)

# Same plot with subset.min = 0 and subset.max = 100 supplied (presumably
# limiting the plotted range -- verify against the helper).
compare_boxplot_long(
  pivoted.df = tobacco.pivoted,
  var = "ncigs",
  scen1 = "raw",
  scen2 = "simulation",
  subset.min = 0,
  subset.max = 100
)

# Box plots of ncigs produced by yearly_box_plots(), with the same limits.
yearly_box_plots(
  pivoted.df = tobacco.pivoted,
  var = "ncigs",
  scen1 = "raw",
  scen2 = "simulation",
  subset.min = 0,
  subset.max = 100
)
# Housing quality: combine the simulated (cv) and observed (raw) data for this
# variable via the project helper.
hous.pivoted <- combine_and_pivot_long(
  df1 = cv,
  df1.name = "simulated",
  df2 = raw,
  df2.name = "raw",
  var = "housing_quality"
)

# Plot housing_quality with the ordinal-variable plotting helper and save the
# output to save.path.
cv_ordinal_plots(
  pivoted.df = hous.pivoted,
  var = "housing_quality",
  save = TRUE,
  save.path = "/home/luke/Documents/WORK/MINOS/VALIDATION_PLOTS/Cross-Validation/"
)
## `summarise()` has grouped output by 'time', 'scenario'. You can override using
## the `.groups` argument.
## Warning in dir.create(dir, recursive = TRUE): cannot create dir '/home/luke',
## reason 'Operation not supported'
## Warning in normalizePath(dir):
## path[1]="/home/luke/Documents/WORK/MINOS/VALIDATION_PLOTS/Cross-Validation": No
## such file or directory
## Warning in grDevices::dev.off(): agg could not write to the given file
## Warning in dir.create(dir, recursive = TRUE): cannot create dir '/home/luke',
## reason 'Operation not supported'
## Warning in normalizePath(dir):
## path[1]="/home/luke/Documents/WORK/MINOS/VALIDATION_PLOTS/Cross-Validation": No
## such file or directory
## Warning in grDevices::dev.off(): agg could not write to the given file
# Neighbourhood safety: combine the simulated (cv) and observed (raw) data for
# this variable via the project helper.
nhsafe.pivoted <- combine_and_pivot_long(
  df1 = cv,
  df1.name = "simulated",
  df2 = raw,
  df2.name = "raw",
  var = "neighbourhood_safety"
)

# Plot neighbourhood_safety with the ordinal-variable plotting helper and save
# the output to save.path.
cv_ordinal_plots(
  pivoted.df = nhsafe.pivoted,
  var = "neighbourhood_safety",
  save = TRUE,
  save.path = "/home/luke/Documents/WORK/MINOS/VALIDATION_PLOTS/Cross-Validation/"
)
## `summarise()` has grouped output by 'time', 'scenario'. You can override using
## the `.groups` argument.
## Warning in dir.create(dir, recursive = TRUE): cannot create dir '/home/luke',
## reason 'Operation not supported'
## Warning in normalizePath(dir):
## path[1]="/home/luke/Documents/WORK/MINOS/VALIDATION_PLOTS/Cross-Validation": No
## such file or directory
## Warning in grDevices::dev.off(): agg could not write to the given file
## Warning in dir.create(dir, recursive = TRUE): cannot create dir '/home/luke',
## reason 'Operation not supported'
## Warning in normalizePath(dir):
## path[1]="/home/luke/Documents/WORK/MINOS/VALIDATION_PLOTS/Cross-Validation": No
## such file or directory
## Warning in grDevices::dev.off(): agg could not write to the given file
# Loneliness: combine the simulated (cv) and observed (raw) data for this
# variable via the project helper.
lnly.pivoted <- combine_and_pivot_long(
  df1 = cv,
  df1.name = "simulated",
  df2 = raw,
  df2.name = "raw",
  var = "loneliness"
)

# Plot loneliness with the ordinal-variable plotting helper and save the
# output to save.path.
cv_ordinal_plots(
  pivoted.df = lnly.pivoted,
  var = "loneliness",
  save = TRUE,
  save.path = "/home/luke/Documents/WORK/MINOS/VALIDATION_PLOTS/Cross-Validation/"
)
## `summarise()` has grouped output by 'time', 'scenario'. You can override using
## the `.groups` argument.
## Warning in dir.create(dir, recursive = TRUE): cannot create dir '/home/luke',
## reason 'Operation not supported'
## Warning in normalizePath(dir):
## path[1]="/home/luke/Documents/WORK/MINOS/VALIDATION_PLOTS/Cross-Validation": No
## such file or directory
## Warning in grDevices::dev.off(): agg could not write to the given file
## Warning in dir.create(dir, recursive = TRUE): cannot create dir '/home/luke',
## reason 'Operation not supported'
## Warning in normalizePath(dir):
## path[1]="/home/luke/Documents/WORK/MINOS/VALIDATION_PLOTS/Cross-Validation": No
## such file or directory
## Warning in grDevices::dev.off(): agg could not write to the given file